Imports and installations
#necessary downloads
!apt install --allow-change-held-packages libcudnn8=8.6.0.163-1+cuda11.8
!pip install -U tensorflow_text tensorflow tensorflow_datasets
!pip install einops
#!pip uninstall -y tensorflow estimator keras
#necessary imports
import concurrent.futures
import collections
import dataclasses
import hashlib
import itertools
import json
import math
import os
import pathlib
import random
import re
import string
import time
import urllib.request
import einops
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from PIL import Image
import requests
import tqdm
from google.colab import files
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
import tensorflow_datasets as tfds
Data importing and preprocessing
def flickr8k(path='flickr8k'):
path = pathlib.Path(path)
if len(list(path.rglob('*'))) < 16197:
tf.keras.utils.get_file(
origin='https://github.com/jbrownlee/Datasets/releases/download/Flickr8k/Flickr8k_Dataset.zip',
cache_dir='.',
cache_subdir=path,
extract=True)
tf.keras.utils.get_file(
origin='https://github.com/jbrownlee/Datasets/releases/download/Flickr8k/Flickr8k_text.zip',
cache_dir='.',
cache_subdir=path,
extract=True)
captions = (path/"Flickr8k.token.txt").read_text().splitlines()
captions = (line.split('\t') for line in captions)
captions = ((fname.split('#')[0], caption) for (fname, caption) in captions)
cap_dict = collections.defaultdict(list)
for fname, cap in captions:
cap_dict[fname].append(cap)
train_files = (path/'Flickr_8k.trainImages.txt').read_text().splitlines()
train_captions = [(str(path/'Flicker8k_Dataset'/fname), cap_dict[fname]) for fname in train_files]
test_files = (path/'Flickr_8k.testImages.txt').read_text().splitlines()
test_captions = [(str(path/'Flicker8k_Dataset'/fname), cap_dict[fname]) for fname in test_files]
train_ds = tf.data.experimental.from_list(train_captions)
test_ds = tf.data.experimental.from_list(test_captions)
return train_ds, test_ds
#Loading our dataset
train_raw, test_raw = flickr8k()
Downloading data from https://github.com/jbrownlee/Datasets/releases/download/Flickr8k/Flickr8k_Dataset.zip 1115419746/1115419746 [==============================] - 16s 0us/step Downloading data from https://github.com/jbrownlee/Datasets/releases/download/Flickr8k/Flickr8k_text.zip 2340801/2340801 [==============================] - 0s 0us/step
Image feature extractor Mobilenet¶
IMAGE_SHAPE=(224, 224, 3)
mobilenet = tf.keras.applications.MobileNetV3Small(
input_shape=IMAGE_SHAPE,
include_top=False,
include_preprocessing=True)
mobilenet.trainable=False
def load_image(image_path):
img = tf.io.read_file(image_path)
img = tf.io.decode_jpeg(img, channels=3)
img = tf.image.resize(img, IMAGE_SHAPE[:-1])
return img
Text tokenizer
def standardize(s):
s = tf.strings.lower(s)
s = tf.strings.regex_replace(s, f'[{re.escape(string.punctuation)}]', '')
s = tf.strings.join(['[START]', s, '[END]'], separator=' ')
return s
# Use the top 5000 words for a vocabulary.
vocabulary_size = 5000
tokenizer = tf.keras.layers.TextVectorization(
max_tokens=vocabulary_size,
standardize=standardize,
ragged=True)
# Learn the vocabulary from the caption data.
tokenizer.adapt(train_raw.map(lambda fp,txt: txt).unbatch().batch(1024))
# Create mappings for words to indices and indices to words.
word_to_index = tf.keras.layers.StringLookup(
mask_token="",
vocabulary=tokenizer.get_vocabulary())
index_to_word = tf.keras.layers.StringLookup(
mask_token="",
vocabulary=tokenizer.get_vocabulary(),
invert=True)
Prepare our datasets¶
def match_shapes(images, captions):
caption_shape = einops.parse_shape(captions, 'b c')
captions = einops.rearrange(captions, 'b c -> (b c)')
images = einops.repeat(
images, 'b ... -> (b c) ...',
c = caption_shape['c'])
return images, captions
for ex_paths, ex_captions in train_raw.batch(32).take(1):
break
print('image paths:', ex_paths.shape)
print('captions:', ex_captions.shape)
print()
ex_paths, ex_captions = match_shapes(images=ex_paths, captions=ex_captions)
print('image_paths:', ex_paths.shape)
print('captions:', ex_captions.shape)
image paths: (32,) captions: (32, 5) image_paths: (160,) captions: (160,)
To be compatible with keras training the dataset should contain (inputs, labels) pairs. For text generation the tokens are both an input and the labels, shifted by one step. This function will convert an (images, texts) pair to an ((images, input_tokens), label_tokens) pair:
def prepare_txt(imgs, txts):
tokens = tokenizer(txts)
input_tokens = tokens[..., :-1]
label_tokens = tokens[..., 1:]
return (imgs, input_tokens), label_tokens
This function adds operations to a dataset. The steps are:
- Load the images (and ignore images that fail to load).
- Replicate images to match the number of captions.
- Shuffle and rebatch the
image, captionpairs. - Tokenize the text, shift the tokens and add
label_tokens. - Convert the text from a
RaggedTensorrepresentation to padded denseTensorrepresentation.
def prepare_dataset(ds, tokenizer, batch_size=32, shuffle_buffer=1000):
# Load the images and make batches.
ds = (ds
.shuffle(10000)
.map(lambda path, caption: (load_image(path), caption))
.apply(tf.data.experimental.ignore_errors())
.batch(batch_size))
def to_tensor(inputs, labels):
(images, in_tok), out_tok = inputs, labels
return (images, in_tok.to_tensor()), out_tok.to_tensor()
return (ds
.map(match_shapes, tf.data.AUTOTUNE)
.unbatch()
.shuffle(shuffle_buffer)
.batch(batch_size)
.map(prepare_txt, tf.data.AUTOTUNE)
.map(to_tensor, tf.data.AUTOTUNE)
)
train_ds = prepare_dataset(train_raw, tokenizer)
train_ds.element_spec
WARNING:tensorflow:From <ipython-input-23-03f5d7fa769a>:6: ignore_errors (from tensorflow.python.data.experimental.ops.error_ops) is deprecated and will be removed in a future version. Instructions for updating: Use `tf.data.Dataset.ignore_errors` instead.
((TensorSpec(shape=(None, 224, 224, 3), dtype=tf.float32, name=None), TensorSpec(shape=(None, None), dtype=tf.int64, name=None)), TensorSpec(shape=(None, None), dtype=tf.int64, name=None))
test_ds = prepare_dataset(test_raw, tokenizer)
test_ds.element_spec
((TensorSpec(shape=(None, 224, 224, 3), dtype=tf.float32, name=None), TensorSpec(shape=(None, None), dtype=tf.int64, name=None)), TensorSpec(shape=(None, None), dtype=tf.int64, name=None))
Cache the image features¶
def save_dataset(ds, save_path, image_model, tokenizer, shards=10, batch_size=32):
# Load the images and make batches.
ds = (ds
.map(lambda path, caption: (load_image(path), caption))
.apply(tf.data.experimental.ignore_errors())
.batch(batch_size))
# Run the feature extractor on each batch
# Don't do this in a .map, because tf.data runs on the CPU.
def gen():
for (images, captions) in tqdm.tqdm(ds):
feature_maps = image_model(images)
feature_maps, captions = match_shapes(feature_maps, captions)
yield feature_maps, captions
# Wrap the generator in a new tf.data.Dataset.
new_ds = tf.data.Dataset.from_generator(
gen,
output_signature=(
tf.TensorSpec(shape=image_model.output_shape),
tf.TensorSpec(shape=(None,), dtype=tf.string)))
# Apply the tokenization
new_ds = (new_ds
.map(prepare_txt, tf.data.AUTOTUNE)
.unbatch()
.shuffle(1000))
# Save the dataset into shard files.
def shard_func(i, item):
return i % shards
new_ds.enumerate().save(save_path, shard_func=shard_func)
def load_dataset(save_path, batch_size=32, shuffle=1000, cycle_length=2):
def custom_reader_func(datasets):
datasets = datasets.shuffle(1000)
return datasets.interleave(lambda x: x, cycle_length=cycle_length)
ds = tf.data.Dataset.load(save_path, reader_func=custom_reader_func)
def drop_index(i, x):
return x
ds = (ds
.map(drop_index, tf.data.AUTOTUNE)
.shuffle(shuffle)
.padded_batch(batch_size)
.prefetch(tf.data.AUTOTUNE))
return ds
save_dataset(train_raw, 'train_cache', mobilenet, tokenizer)
save_dataset(test_raw, 'test_cache', mobilenet, tokenizer)
188it [02:42, 1.16it/s] 32it [00:41, 1.29s/it]
Data ready for training¶
After those preprocessing steps, here are the datasets:
train_ds = load_dataset('train_cache')
test_ds = load_dataset('test_cache')
train_ds.element_spec
((TensorSpec(shape=(None, 7, 7, 576), dtype=tf.float32, name=None), TensorSpec(shape=(None, None), dtype=tf.int64, name=None)), TensorSpec(shape=(None, None), dtype=tf.int64, name=None))
The dataset now returns (input, label) pairs suitable for training with keras. The inputs are (images, input_tokens) pairs. The images have been processed with the feature-extractor model. For each location in the input_tokens the model looks at the text so far and tries to predict the next which is lined up at the same location in the labels.
for (inputs, ex_labels) in train_ds.take(1):
(ex_img, ex_in_tok) = inputs
print(ex_img.shape)
print(ex_in_tok.shape)
print(ex_labels.shape)
(32, 7, 7, 576) (32, 23) (32, 23)
The input tokens and the labels are the same, just shifted by 1 step:
print(ex_in_tok[0].numpy())
print(ex_labels[0].numpy())
[ 3 185 8 29 273 12 49 7 6 276 13 2 394 3139
22 2 97 0 0 0 0 0 0]
[ 185 8 29 273 12 49 7 6 276 13 2 394 3139 22
2 97 4 0 0 0 0 0 0]
train_ds.take(1)
<_TakeDataset element_spec=((TensorSpec(shape=(None, 7, 7, 576), dtype=tf.float32, name=None), TensorSpec(shape=(None, None), dtype=tf.int64, name=None)), TensorSpec(shape=(None, None), dtype=tf.int64, name=None))>
A Transformer decoder model¶
|
The model will be implemented in three main parts:
- Input - The token embedding and positional encoding (
SeqEmbedding). - Decoder - A stack of transformer decoder layers (
DecoderLayer) where each contains:- A causal self attention later (
CausalSelfAttention), where each output location can attend to the output so far. - A cross attention layer (
CrossAttention) where each output location can attend to the input image. - A feed forward network (
FeedForward) layer which further processes each output location independently.
- A causal self attention later (
- Output - A multiclass-classification over the output vocabulary.
Input¶
class SeqEmbedding(tf.keras.layers.Layer):
def __init__(self, vocab_size, max_length, depth):
super().__init__()
self.pos_embedding = tf.keras.layers.Embedding(input_dim=max_length, output_dim=depth)
self.token_embedding = tf.keras.layers.Embedding(
input_dim=vocab_size,
output_dim=depth,
mask_zero=True)
self.add = tf.keras.layers.Add()
def call(self, seq):
seq = self.token_embedding(seq) # (batch, seq, depth)
x = tf.range(tf.shape(seq)[1]) # (seq)
x = x[tf.newaxis, :] # (1, seq)
x = self.pos_embedding(x) # (1, seq, depth)
return self.add([seq,x])
Decoder¶
class CausalSelfAttention(tf.keras.layers.Layer):
def __init__(self, **kwargs):
super().__init__()
self.mha = tf.keras.layers.MultiHeadAttention(**kwargs)
# Use Add instead of + so the keras mask propagates through.
self.add = tf.keras.layers.Add()
self.layernorm = tf.keras.layers.LayerNormalization()
def call(self, x):
attn = self.mha(query=x, value=x,
use_causal_mask=True)
x = self.add([x, attn])
return self.layernorm(x)
The CrossAttention layer is below. Note the use of return_attention_scores.
class CrossAttention(tf.keras.layers.Layer):
def __init__(self,**kwargs):
super().__init__()
self.mha = tf.keras.layers.MultiHeadAttention(**kwargs)
self.add = tf.keras.layers.Add()
self.layernorm = tf.keras.layers.LayerNormalization()
def call(self, x, y, **kwargs):
attn, attention_scores = self.mha(
query=x, value=y,
return_attention_scores=True)
self.last_attention_scores = attention_scores
x = self.add([x, attn])
return self.layernorm(x)
The FeedForward layer is below. Remember that a layers.Dense layer is applied to the last axis of the input. The input will have a shape of (batch, sequence, channels), so it automatically applies pointwise across the batch and sequence axes.
class FeedForward(tf.keras.layers.Layer):
def __init__(self, units, dropout_rate=0.1):
super().__init__()
self.seq = tf.keras.Sequential([
tf.keras.layers.Dense(units=2*units, activation='relu'),
tf.keras.layers.Dense(units=units),
tf.keras.layers.Dropout(rate=dropout_rate),
])
self.layernorm = tf.keras.layers.LayerNormalization()
def call(self, x):
x = x + self.seq(x)
return self.layernorm(x)
Next arrange these three layers into a larger DecoderLayer. Each decoder layer applies the three smaller layers in sequence. After each sublayer the shape of out_seq is (batch, sequence, channels). The decoder layer also returns the attention_scores for later visualizations.
class DecoderLayer(tf.keras.layers.Layer):
def __init__(self, units, num_heads=1, dropout_rate=0.1):
super().__init__()
self.self_attention = CausalSelfAttention(num_heads=num_heads,
key_dim=units,
dropout=dropout_rate)
self.cross_attention = CrossAttention(num_heads=num_heads,
key_dim=units,
dropout=dropout_rate)
self.ff = FeedForward(units=units, dropout_rate=dropout_rate)
def call(self, inputs, training=False):
in_seq, out_seq = inputs
# Text input
out_seq = self.self_attention(out_seq)
out_seq = self.cross_attention(out_seq, in_seq)
self.last_attention_scores = self.cross_attention.last_attention_scores
out_seq = self.ff(out_seq)
return out_seq
Output¶
Handle bad tokens:
Smart initialization:
#@title
class TokenOutput(tf.keras.layers.Layer):
def __init__(self, tokenizer, banned_tokens=('', '[UNK]', '[START]'), **kwargs):
super().__init__()
self.dense = tf.keras.layers.Dense(
units=tokenizer.vocabulary_size(), **kwargs)
self.tokenizer = tokenizer
self.banned_tokens = banned_tokens
self.bias = None
def adapt(self, ds):
counts = collections.Counter()
vocab_dict = {name: id
for id, name in enumerate(self.tokenizer.get_vocabulary())}
for tokens in tqdm.tqdm(ds):
counts.update(tokens.numpy().flatten())
counts_arr = np.zeros(shape=(self.tokenizer.vocabulary_size(),))
counts_arr[np.array(list(counts.keys()), dtype=np.int32)] = list(counts.values())
counts_arr = counts_arr[:]
for token in self.banned_tokens:
counts_arr[vocab_dict[token]] = 0
total = counts_arr.sum()
p = counts_arr/total
p[counts_arr==0] = 1.0
log_p = np.log(p) # log(1) == 0
entropy = -(log_p*p).sum()
print()
print(f"Uniform entropy: {np.log(self.tokenizer.vocabulary_size()):0.2f}")
print(f"Marginal entropy: {entropy:0.2f}")
self.bias = log_p
self.bias[counts_arr==0] = -1e9
def call(self, x):
x = self.dense(x)
# TODO(b/250038731): Fix this.
# An Add layer doesn't work because of the different shapes.
# This clears the mask, that's okay because it prevents keras from rescaling
# the losses.
return x + self.bias
The smart initialization will significantly reduce the initial loss:
output_layer = TokenOutput(tokenizer, banned_tokens=('', '[UNK]', '[START]'))
# This might run a little faster if the dataset didn't also have to load the image data.
output_layer.adapt(train_ds.map(lambda inputs, labels: labels))
100%|āāāāāāāāāā| 938/938 [00:17<00:00, 52.35it/s]
Uniform entropy: 8.52 Marginal entropy: 5.29
Build the model¶
To build the model, you need to combine several parts:
- The image
feature_extractorand the texttokenizerand. - The
seq_embeddinglayer, to convert batches of token-IDs to vectors(batch, sequence, channels). - The stack of
DecoderLayerslayers that will process the text and image data. - The
output_layerwhich returns a pointwise prediction of what the next word should be.
class Captioner(tf.keras.Model):
@classmethod
def add_method(cls, fun):
setattr(cls, fun.__name__, fun)
return fun
def __init__(self, tokenizer, feature_extractor, output_layer, num_layers=1,
units=256, max_length=50, num_heads=1, dropout_rate=0.1):
super().__init__()
self.feature_extractor = feature_extractor
self.tokenizer = tokenizer
self.word_to_index = tf.keras.layers.StringLookup(
mask_token="",
vocabulary=tokenizer.get_vocabulary())
self.index_to_word = tf.keras.layers.StringLookup(
mask_token="",
vocabulary=tokenizer.get_vocabulary(),
invert=True)
self.seq_embedding = SeqEmbedding(
vocab_size=tokenizer.vocabulary_size(),
depth=units,
max_length=max_length)
self.decoder_layers = [
DecoderLayer(units, num_heads=num_heads, dropout_rate=dropout_rate)
for n in range(num_layers)]
self.output_layer = output_layer
@Captioner.add_method
def call(self, inputs):
image, txt = inputs
if image.shape[-1] == 3:
# Apply the feature-extractor, if you get an RGB image.
image = self.feature_extractor(image)
# Flatten the feature map
image = einops.rearrange(image, 'b h w c -> b (h w) c')
if txt.dtype == tf.string:
# Apply the tokenizer if you get string inputs.
txt = tokenizer(txt)
txt = self.seq_embedding(txt)
# Look at the image
for dec_layer in self.decoder_layers:
txt = dec_layer(inputs=(image, txt))
txt = self.output_layer(txt)
return txt
model = Captioner(tokenizer, feature_extractor=mobilenet, output_layer=output_layer,
units=256, dropout_rate=0.5, num_layers=2, num_heads=2)
@Captioner.add_method
def simple_gen(self, image, temperature=1):
initial = self.word_to_index([['[START]']]) # (batch, sequence)
img_features = self.feature_extractor(image[tf.newaxis, ...])
tokens = initial # (batch, sequence)
for n in range(50):
preds = self((img_features, tokens)).numpy() # (batch, sequence, vocab)
preds = preds[:,-1, :] #(batch, vocab)
if temperature==0:
next = tf.argmax(preds, axis=-1)[:, tf.newaxis] # (batch, 1)
else:
next = tf.random.categorical(preds/temperature, num_samples=1) # (batch, 1)
tokens = tf.concat([tokens, next], axis=1) # (batch, sequence)
if next[0] == self.word_to_index('[END]'):
break
words = index_to_word(tokens[0, 1:-1])
result = tf.strings.reduce_join(words, axis=-1, separator=' ')
return result.numpy().decode()
Train¶
Losses and metrics¶
Here's an implementation of a masked loss and accuracy:
When calculating the mask for the loss, note the loss < 1e8. This term discards the artificial, impossibly high losses for the banned_tokens.
def masked_loss(labels, preds):
loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels, preds)
mask = (labels != 0) & (loss < 1e8)
mask = tf.cast(mask, loss.dtype)
loss = loss*mask
loss = tf.reduce_sum(loss)/tf.reduce_sum(mask)
return loss
def masked_acc(labels, preds):
mask = tf.cast(labels!=0, tf.float32)
preds = tf.argmax(preds, axis=-1)
labels = tf.cast(labels, tf.int64)
match = tf.cast(preds == labels, mask.dtype)
acc = tf.reduce_sum(match*mask)/tf.reduce_sum(mask)
return acc
Callbacks¶
For feedback during training setup a keras.callbacks.Callback to generate some captions for the surfer image at the end of each epoch.
class GenerateText(tf.keras.callbacks.Callback):
def __init__(self):
#image_url = 'https://images.pexels.com/photos/7092613/pexels-photo-7092613.jpeg?auto=compress&cs=tinysrgb&w=1260&h=750&dpr=1'
image_path = "/content/Profteachingexample.jpg"
self.image = load_image(image_path)
def on_epoch_end(self, epochs=None, logs=None):
print()
print()
for t in (0.0, 0.5, 1.0):
result = self.model.simple_gen(self.image, temperature=t)
print(result)
print()
callbacks = [
GenerateText(),
tf.keras.callbacks.EarlyStopping(
patience=3, restore_best_weights=True)]
Train¶
Configure and execute the training.
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
loss=masked_loss,
metrics=[masked_acc])
history = model.fit(
train_ds.repeat(),
steps_per_epoch=100,
validation_data=test_ds.repeat(),
validation_steps=20,
epochs=100,
callbacks=callbacks)
Epoch 1/100 100/100 [==============================] - ETA: 0s - loss: 5.0138 - masked_acc: 0.2003 a man in a a man in a a man in a woman a red in a man two purple in the with and teammates stand 100/100 [==============================] - 83s 738ms/step - loss: 5.0138 - masked_acc: 0.2003 - val_loss: 4.7083 - val_masked_acc: 0.2366 Epoch 2/100 100/100 [==============================] - ETA: 0s - loss: 4.6523 - masked_acc: 0.2535 a man in a red and a red a man is is of a rock van man climbs a green handicapped the women 100/100 [==============================] - 70s 696ms/step - loss: 4.6523 - masked_acc: 0.2535 - val_loss: 4.4058 - val_masked_acc: 0.2643 Epoch 3/100 100/100 [==============================] - ETA: 0s - loss: 4.4321 - masked_acc: 0.2752 a man in a red and a red and a man a woman and a large shirt and a blue a puppies outstretched kayaking shaking object and glacier dressed with two 100/100 [==============================] - 76s 765ms/step - loss: 4.4321 - masked_acc: 0.2752 - val_loss: 4.2230 - val_masked_acc: 0.2837 Epoch 4/100 100/100 [==============================] - ETA: 0s - loss: 4.2665 - masked_acc: 0.2901 a man in a red shirt and a woman in a man a man is standing on a blue and a boat a boy jumping a little girl jacket women near to orange shirt 100/100 [==============================] - 73s 731ms/step - loss: 4.2665 - masked_acc: 0.2901 - val_loss: 4.0793 - val_masked_acc: 0.2886 Epoch 5/100 100/100 [==============================] - ETA: 0s - loss: 4.1226 - masked_acc: 0.3062 a man in a red shirt and a woman in a red and a man and a man a man wearing a pink and a yellow shirt and white dog a man crosscountry wearing and and sides 100/100 [==============================] - 73s 729ms/step - loss: 4.1226 - masked_acc: 0.3062 - val_loss: 3.8932 - val_masked_acc: 0.3214 Epoch 6/100 100/100 [==============================] - ETA: 0s - loss: 4.0149 - masked_acc: 0.3151 a man in a woman and a woman in a woman in a man a man is standing in a white and blue and a man in a black and white shirt a man 100/100 [==============================] - 71s 712ms/step - loss: 4.0149 - masked_acc: 0.3151 - val_loss: 3.9269 - val_masked_acc: 0.3162 Epoch 7/100 100/100 [==============================] - ETA: 0s - loss: 3.9492 - masked_acc: 0.3219 a man in a black and a black and a man in a man and a man and a black and a black and a man and a man and a man and a man is standing in a man and a man and a man in a man a man in a yellow shirt and a red shirt and a woman and a group of a red shirt front of a man as a woman is pink back 100/100 [==============================] - 84s 845ms/step - loss: 3.9492 - masked_acc: 0.3219 - val_loss: 3.7461 - val_masked_acc: 0.3290 Epoch 8/100 100/100 [==============================] - ETA: 0s - loss: 3.8674 - masked_acc: 0.3256 a man in a red shirt and a woman in a red shirt and a white shirt a woman in a the shirt is standing on a red shirt and a people a young trail in the dress is in the player for the trick 100/100 [==============================] - 74s 738ms/step - loss: 3.8674 - masked_acc: 0.3256 - val_loss: 3.7841 - val_masked_acc: 0.3261 Epoch 9/100 100/100 [==============================] - ETA: 0s - loss: 3.7917 - masked_acc: 0.3327 a man in a red shirt and a woman in a red shirt and a white shirt and a red shirt and a man and a man and a man and a woman and a white shirt and white and a man and a woman and a man in a group of people are playing in a trampoline the four mother are are players at the takes night 100/100 [==============================] - 76s 764ms/step - loss: 3.7917 - masked_acc: 0.3327 - val_loss: 3.6401 - val_masked_acc: 0.3435 Epoch 10/100 100/100 [==============================] - ETA: 0s - loss: 3.7249 - masked_acc: 0.3418 a man in a black shirt and a woman in a black and a red shirt and white shirt and a woman in a woman in a man and a man in a black and white shirt and white and a woman in a woman with a woman in a woman in a blue shirt and black shirt is standing in front of a blue shirt two girls one woman standing on his bmx crowd of a man posts 100/100 [==============================] - 80s 806ms/step - loss: 3.7249 - masked_acc: 0.3418 - val_loss: 3.7016 - val_masked_acc: 0.3313 Epoch 11/100 100/100 [==============================] - ETA: 0s - loss: 3.6800 - masked_acc: 0.3435 a man in a black and a woman and a woman in a black and a man and a black and a woman and a woman and a man and a man in a woman and a man in a woman in a man and a woman and a three people are standing on a picture of an old man in front of a white three girls through some road standing on the camera 100/100 [==============================] - 75s 749ms/step - loss: 3.6800 - masked_acc: 0.3435 - val_loss: 3.5332 - val_masked_acc: 0.3464 Epoch 12/100 100/100 [==============================] - ETA: 0s - loss: 3.6286 - masked_acc: 0.3468 a man in a black and a woman in a white shirt and a black and a black shirt and a woman and a woman and a man and a man in a black and white shirt and white and a woman and a woman and a woman in two people are sitting on a sidewalk three skateboarder are carrying a bicycle in a face with a face 100/100 [==============================] - 83s 834ms/step - loss: 3.6286 - masked_acc: 0.3468 - val_loss: 3.4627 - val_masked_acc: 0.3466 Epoch 13/100 100/100 [==============================] - ETA: 0s - loss: 3.6015 - masked_acc: 0.3534 a man in a black shirt and a woman and a woman in a white shirt and a woman a man in a white hat and smiling a woman wearing a red and a striped shirt standing of laughing after a house 100/100 [==============================] - 76s 762ms/step - loss: 3.6015 - masked_acc: 0.3534 - val_loss: 3.4537 - val_masked_acc: 0.3540 Epoch 14/100 100/100 [==============================] - ETA: 0s - loss: 3.5384 - masked_acc: 0.3543 a man in a black shirt and a woman in a black and a black shirt and white shirt and a woman in a man and a man and a man in a black and white shirt and white shirt and a black shirt and a black shirt is a man in a black helmet is walking up a man in a small white shirt a woman taking a very men in long singing 100/100 [==============================] - 81s 813ms/step - loss: 3.5384 - masked_acc: 0.3543 - val_loss: 3.4021 - val_masked_acc: 0.3610 Epoch 15/100 100/100 [==============================] - ETA: 0s - loss: 3.5169 - masked_acc: 0.3552 a man in a black shirt and a woman in a black shirt and a white shirt and a black shirt and a man in a man in a man is standing in a white shirt and a woman in a man in a black and a woman in a woman wearing a white shirt and white shirt and a woman posing for a building a man catches a city street are face while are wear attempts to a cigarette at a river 100/100 [==============================] - 83s 831ms/step - loss: 3.5169 - masked_acc: 0.3552 - val_loss: 3.3880 - val_masked_acc: 0.3537 Epoch 16/100 100/100 [==============================] - ETA: 0s - loss: 3.4538 - masked_acc: 0.3650 a man in a black shirt and a woman in a black and a white shirt and white shirt and a woman in a man and a man in a man in a black and a man and white shirt and a man and a woman in a black a man standing on a wall a child in in her shirt outfit sitting on a fishing other at a group of a wall 100/100 [==============================] - 75s 756ms/step - loss: 3.4538 - masked_acc: 0.3650 - val_loss: 3.3539 - val_masked_acc: 0.3544 Epoch 17/100 100/100 [==============================] - ETA: 0s - loss: 3.4124 - masked_acc: 0.3660 a man in a black shirt and a man in a black and a man in a white shirt and a man and a man and a man and a man in a man in a man in a man and a man and a man in a man a man and woman in a woman posing for a woman in front of a man in a woman and white shirt are a a black and a picture four people sitting at the flower fire 100/100 [==============================] - 78s 778ms/step - loss: 3.4124 - masked_acc: 0.3660 - val_loss: 3.3390 - val_masked_acc: 0.3609 Epoch 18/100 100/100 [==============================] - ETA: 0s - loss: 3.3904 - masked_acc: 0.3619 a man in a black shirt and a woman is standing on a man in a man in a black and a black and a white shirt and a man is standing in a white shirt a man in a black shirt and black hat is sitting in front of an older man in front of a man in a man is sitting along a man with a large black and white hat a man sits in winter bathroom on a containing a street 100/100 [==============================] - 78s 783ms/step - loss: 3.3904 - masked_acc: 0.3619 - val_loss: 3.3477 - val_masked_acc: 0.3590 Epoch 19/100 100/100 [==============================] - ETA: 0s - loss: 3.3459 - masked_acc: 0.3705 a man in a black shirt and a woman in a black jacket and a man in a man and a man in a white shirt and a man and a man in a white shirt and a woman in a man in a man is standing in a a man and a woman in a woman and white tshirt and a woman in front of a man and sunglasses two people dressed another side near an man and one of street 100/100 [==============================] - 79s 787ms/step - loss: 3.3459 - masked_acc: 0.3705 - val_loss: 3.3411 - val_masked_acc: 0.3623 Epoch 20/100 100/100 [==============================] - ETA: 0s - loss: 3.2832 - masked_acc: 0.3732 a man in a black shirt and a man in a black shirt and a black shirt is standing in a man in a black and a man in a man in a black and a man in a man in a black shirt a man in a black hat and a woman in a black shirt and a man in a man with a man and a pink and a man in a white shirt is sitting on a man on a man in the shirt is sitting on a shirt a man on a smiling of camping mohawk is paddles over a table 100/100 [==============================] - 83s 831ms/step - loss: 3.2832 - masked_acc: 0.3732 - val_loss: 3.2589 - val_masked_acc: 0.3744 Epoch 21/100 100/100 [==============================] - ETA: 0s - loss: 3.2645 - masked_acc: 0.3781 a man in a black shirt and a woman in a black shirt and a woman in a man in a man in a white shirt and a man in a man in a white shirt a man wearing a black shirt and a black hat and a woman with a woman and a man with her woman in a woman a man is looking grocery bus at outside 100/100 [==============================] - 79s 795ms/step - loss: 3.2645 - masked_acc: 0.3781 - val_loss: 3.2582 - val_masked_acc: 0.3616 Epoch 22/100 100/100 [==============================] - ETA: 0s - loss: 3.2652 - masked_acc: 0.3733 a man in a black shirt and a woman in a white shirt and a woman in a black and a white shirt a woman in a black shirt and white shirt is standing on a cellphone a young boy blows white dress is holding jerseys eating looking at a snowy sunny cart in a bench 100/100 [==============================] - 82s 816ms/step - loss: 3.2652 - masked_acc: 0.3733 - val_loss: 3.2275 - val_masked_acc: 0.3729 Epoch 23/100 100/100 [==============================] - ETA: 0s - loss: 3.2089 - masked_acc: 0.3814 a man in a black shirt and a woman in a white shirt and a white shirt is standing in a woman in a white shirt and a man in a black and a white shirt a woman in a black shirt and a woman in a black jacket is standing on a white shirt teenagers playing a man wearing white shirt 100/100 [==============================] - 76s 761ms/step - loss: 3.2089 - masked_acc: 0.3814 - val_loss: 3.2327 - val_masked_acc: 0.3686 Epoch 24/100 100/100 [==============================] - ETA: 0s - loss: 3.1893 - masked_acc: 0.3788 a man in a black shirt is standing on a sidewalk a woman in a black hat is wearing a white shirt with a light someone is holding a hair next to the stop at a crowd 100/100 [==============================] - 75s 754ms/step - loss: 3.1893 - masked_acc: 0.3788 - val_loss: 3.2234 - val_masked_acc: 0.3673 Epoch 25/100 100/100 [==============================] - ETA: 0s - loss: 3.1900 - masked_acc: 0.3844 a man in a black shirt and white shirt is standing on a white shirt and a white shirt a man in a dress and a woman and a woman is sitting on a woman in a white shirt and a white shirt and a man in a bench a man sitting in a black skull hat wearing a surfboard and sunglasses 100/100 [==============================] - 72s 718ms/step - loss: 3.1900 - masked_acc: 0.3844 - val_loss: 3.1768 - val_masked_acc: 0.3722 Epoch 26/100 100/100 [==============================] - ETA: 0s - loss: 3.1686 - masked_acc: 0.3847 a man in a white shirt and a woman in a white shirt is standing in front of a white shirt a man is standing on a a white bench while a man in a white shirt is holding a crowd of a crowd a man smokes a blonde listening to making a batman logo with says american pajamas stops behind her hand 100/100 [==============================] - 75s 753ms/step - loss: 3.1686 - masked_acc: 0.3847 - val_loss: 3.1592 - val_masked_acc: 0.3701 Epoch 27/100 100/100 [==============================] - ETA: 0s - loss: 3.1877 - masked_acc: 0.3822 a man and a woman are standing on a bench a man and two people are standing in a blue and a woman two girls wearing orange jeans 100/100 [==============================] - 66s 665ms/step - loss: 3.1877 - masked_acc: 0.3822 - val_loss: 3.0931 - val_masked_acc: 0.3768 Epoch 28/100 100/100 [==============================] - ETA: 0s - loss: 3.1877 - masked_acc: 0.3800 a man in a black shirt and a woman in a white shirt and a woman in a white shirt and a white shirt and a white shirt and a white shirt and a white shirt is standing in a black and a man in a black jacket a man and a woman are sitting on a bench while a woman and a woman standing in a blue and a crowd the person is outside of some record flags in an old basket 100/100 [==============================] - 73s 728ms/step - loss: 3.1877 - masked_acc: 0.3800 - val_loss: 3.0981 - val_masked_acc: 0.3835 Epoch 29/100 100/100 [==============================] - ETA: 0s - loss: 3.0838 - masked_acc: 0.3891 a man in a black shirt and white shirt and white shirt is standing in front of a crowd a man wearing a blue shirt and a black hat is holding a sign a man wearing sunglasses holds sunglasses looking at a picture in the background 100/100 [==============================] - 70s 701ms/step - loss: 3.0838 - masked_acc: 0.3891 - val_loss: 3.0915 - val_masked_acc: 0.3817 Epoch 30/100 100/100 [==============================] - ETA: 0s - loss: 3.0632 - masked_acc: 0.3940 a man in a black shirt and white shirt and white shirt and white shirt and white shirt and white dog a girl in a black jacket and white shirt and white hat is holding a picture two men gather for a parade at an and tan ocean 100/100 [==============================] - 75s 754ms/step - loss: 3.0632 - masked_acc: 0.3940 - val_loss: 3.0614 - val_masked_acc: 0.3835 Epoch 31/100 100/100 [==============================] - ETA: 0s - loss: 3.0490 - masked_acc: 0.3948 a man and a woman are sitting on a bench a man is standing on a table with a picture of people a man with glasses and standing at the left car 100/100 [==============================] - 72s 717ms/step - loss: 3.0490 - masked_acc: 0.3948 - val_loss: 2.9916 - val_masked_acc: 0.3934 Epoch 32/100 100/100 [==============================] - ETA: 0s - loss: 3.0326 - masked_acc: 0.3944 a man in a black shirt and a woman are standing in a crowd of a man in a crowd a woman wearing a black shirt and a woman smiles several adults and white shirts are smiling at the bed 100/100 [==============================] - 69s 693ms/step - loss: 3.0326 - masked_acc: 0.3944 - val_loss: 3.1274 - val_masked_acc: 0.3729 Epoch 33/100 100/100 [==============================] - ETA: 0s - loss: 3.0132 - masked_acc: 0.3942 a man in a black hat and a woman in a white shirt and a black jacket a man in a black suit looks for a man in a green jacket sunglasses on a gallery 100/100 [==============================] - 68s 681ms/step - loss: 3.0132 - masked_acc: 0.3942 - val_loss: 3.0431 - val_masked_acc: 0.3825 Epoch 34/100 100/100 [==============================] - ETA: 0s - loss: 3.0025 - masked_acc: 0.3968 a man in a black shirt is standing on a table two men stand on the camera while a man in front of a busy street a man in a blue shirt is smiling at the camera 100/100 [==============================] - 68s 680ms/step - loss: 3.0025 - masked_acc: 0.3968 - val_loss: 2.9842 - val_masked_acc: 0.3950 Epoch 35/100 100/100 [==============================] - ETA: 0s - loss: 3.0346 - masked_acc: 0.3923 a man wearing a black jacket and a woman in a white shirt and a white shirt and a woman in a black jacket a man and woman are standing in a black and white shirt two costumes are standing in front of a meadow 100/100 [==============================] - 71s 713ms/step - loss: 3.0346 - masked_acc: 0.3923 - val_loss: 2.9601 - val_masked_acc: 0.3905 Epoch 36/100 100/100 [==============================] - ETA: 0s - loss: 2.9936 - masked_acc: 0.3947 a man in a black shirt and white shirt is standing in a white shirt and a white shirt and white shirt and a woman in a woman in a white dog a woman is standing in a book two women sit at a picture together as he dusk 100/100 [==============================] - 75s 750ms/step - loss: 2.9936 - masked_acc: 0.3947 - val_loss: 3.0080 - val_masked_acc: 0.3809 Epoch 37/100 100/100 [==============================] - ETA: 0s - loss: 2.9474 - masked_acc: 0.3995 a man and a woman are standing in front of a white building a man in a black jacket and a brown dog drinking from a man with a man a man to phone with a sign girl in her hand 100/100 [==============================] - 68s 683ms/step - loss: 2.9474 - masked_acc: 0.3995 - val_loss: 2.9743 - val_masked_acc: 0.3873 Epoch 38/100 100/100 [==============================] - ETA: 0s - loss: 2.9138 - masked_acc: 0.4048 a man in a black shirt and white shirt is standing on a white bench two men in a red snowsuit is standing next to a sign two girls gathered next to the camera 100/100 [==============================] - 68s 678ms/step - loss: 2.9138 - masked_acc: 0.4048 - val_loss: 2.9486 - val_masked_acc: 0.3993 Epoch 39/100 100/100 [==============================] - ETA: 0s - loss: 2.8814 - masked_acc: 0.4071 a man in a black shirt and white shirt is standing in a white shirt and a white shirt a woman in a black jacket and a man in a white cap stands in front of a sign a woman only a purple striped shirt and woman sitting halloween 100/100 [==============================] - 76s 758ms/step - loss: 2.8814 - masked_acc: 0.4071 - val_loss: 3.0344 - val_masked_acc: 0.3872 Epoch 40/100 100/100 [==============================] - ETA: 0s - loss: 2.8617 - masked_acc: 0.4111 a man in a black shirt and a woman in a white shirt is standing in front of a white shirt the man is drinking from a man in a blue shirt and a black shirt three party happily sitting beside a large group of bags touches the table by trees 100/100 [==============================] - 74s 744ms/step - loss: 2.8617 - masked_acc: 0.4111 - val_loss: 2.9666 - val_masked_acc: 0.3905 Epoch 41/100 100/100 [==============================] - ETA: 0s - loss: 2.8488 - masked_acc: 0.4112 a man in a black shirt and a woman in a white shirt and a white shirt is standing in a white shirt and a white shirt and a white shirt and a white shirt is sitting on a woman in a black shirt and a black shirt is a man in a black shirt and white shirt is holding a picture a couple pose for something in front of a woman sits on either puck for a having stadium 100/100 [==============================] - 72s 723ms/step - loss: 2.8488 - masked_acc: 0.4112 - val_loss: 2.9747 - val_masked_acc: 0.3869
Plot the loss and accuracy over the training run:
plt.plot(history.history['loss'], label='loss')
plt.plot(history.history['val_loss'], label='val_loss')
plt.ylim([0, max(plt.ylim())])
plt.xlabel('Epoch #')
plt.ylabel('CE/token')
plt.legend()
<matplotlib.legend.Legend at 0x7855c84141c0>
plt.plot(history.history['masked_acc'], label='accuracy')
plt.plot(history.history['val_masked_acc'], label='val_accuracy')
plt.ylim([0, max(plt.ylim())])
plt.xlabel('Epoch #')
plt.ylabel('CE/token')
plt.legend()
<matplotlib.legend.Legend at 0x7855c8414280>
Attention plots¶
@Captioner.add_method
def run_and_show_attention(self, image, temperature=0.0):
result_txt = self.simple_gen(image, temperature)
str_tokens = result_txt.split()
str_tokens.append('[END]')
attention_maps = [layer.last_attention_scores for layer in self.decoder_layers]
attention_maps = tf.concat(attention_maps, axis=0)
attention_maps = einops.reduce(
attention_maps,
'batch heads sequence (height width) -> sequence height width',
height=7, width=7,
reduction='mean')
plot_attention_maps(image/255, str_tokens, attention_maps)
t = plt.suptitle(result_txt)
print(result_txt)
t.set_y(1.05)
image_path = "/content/Profteachingexample.jpg"
image = load_image(image_path)
run_and_show_attention(model, image)
a man in a black shirt and white shirt is standing on a white bench
import random
import tensorflow as tf
import matplotlib.pyplot as plt
# Set a seed for reproducibility
seed_value = 42
tf.random.set_seed(seed_value)
random.seed(seed_value)
_, test_dataset = flickr8k()
# Shuffle the test dataset
shuffled_test_dataset = test_dataset.shuffle(buffer_size=10000, seed=seed_value)
# Print and see some random photos from the test dataset with captions
num_photos_to_display = 5
for example in shuffled_test_dataset.take(num_photos_to_display):
photo_path, captions_tensor = example[0], example[1]
# Convert the captions tensor to a numpy array
captions = [caption.numpy().decode('utf-8') for caption in captions_tensor]
# Load and display the original image
img = tf.io.read_file(photo_path)
img = tf.image.decode_jpeg(img, channels=3)
print(photo_path)
plt.imshow(img)
plt.title(f"Photo: {photo_path}")
plt.axis('off')
plt.show()
print("Captions:")
for caption in captions:
print(f" - {caption}")
print("\n")
tf.Tensor(b'flickr8k/Flicker8k_Dataset/486917990_72bd4069af.jpg', shape=(), dtype=string)
Captions: - A girl is climbing a rock wall . - A person climbs a steep mountain . - A person wearing a white hat climbs a rock . - A rock climber ascends . - Someone climbs a rocks . tf.Tensor(b'flickr8k/Flicker8k_Dataset/2621415349_ef1a7e73be.jpg', shape=(), dtype=string)
Captions: - A man in a red baseball cap eats a chip . - a man wearing a red hat has a potato chip in his mouth - A man wearing sunglasses and a red cap putting a chip in his mouth . - A man wearing sunglasses and a red hat is opening his mouth wide and eating a chip . - A man with sunglasses on puts a chip in his mouth . tf.Tensor(b'flickr8k/Flicker8k_Dataset/758921886_55a351dd67.jpg', shape=(), dtype=string)
Captions: - A girl wearing a blue dress is sliding down a tube slide . - A little girl in a white top is inside a concrete tube . - A little girl is sliding down a tunnel smiling . - A little girl smiles as she slides down a tube on a sunny day . - A young girl in inside a tunnel . tf.Tensor(b'flickr8k/Flicker8k_Dataset/3692593096_fbaea67476.jpg', shape=(), dtype=string)
Captions: - Airplane emitting heavy red colored smoke . - An airplane is flying over the mountain trying to extinguish a fire . - A small plane is dropping a red chemical over the mountaintops . - Red spray is being ejected by an orange and white plane flying over the hilltops . - Small red airplane flies over mountaintop dropping red substance over fire . tf.Tensor(b'flickr8k/Flicker8k_Dataset/3217187564_0ffd89dec1.jpg', shape=(), dtype=string)
Captions: - A group of dogs racing . - A number eight racing dog is beating a number four racing dog slightly in a race . - Several dogs wearing muzzles are racing on a track . - There are three dogs wearing numbered jerseys running a race . - Three race dogs run to finish a race .
#The image with attention plots
image_path = "/content/3217187564_0ffd89dec1.jpg"
image = load_image(image_path)
run_and_show_attention(model, image)
a dog is running on a track
#The image of a girl climbing
image_path = "/content/3692593096_fbaea67476.jpg"
image = load_image(image_path)
run_and_show_attention(model, image)
a person in a red shirt is climbing a rocky mountain
from nltk.translate.bleu_score import sentence_bleu
# Assuming references is a list of lists where each inner list contains the reference captions
references = [
['Airplane emitting heavy red colored smoke.'],
['An airplane is flying over the mountain trying to extinguish a fire.'],
['A small plane is dropping a red chemical over the mountaintops.'],
['Red spray is being ejected by an orange and white plane flying over the hilltops.'],
['Small red airplane flies over mountaintop dropping red substance over fire.']
]
# Assuming hypotheses is a list of generated captions
hypotheses = ['a person in a red shirt is climbing a rocky mountain']
# Tokenize the references and hypotheses
references = [[ref[0].split()] for ref in references]
hypotheses = [h.split() for h in hypotheses]
# Calculate BLEU score for each hypothesis-reference pair
individual_scores = [sentence_bleu(references[i], hypotheses[i]) for i in range(len(hypotheses))]
# Calculate the average BLEU score
average_bleu_score = sum(individual_scores) / len(individual_scores)
print("BLEU Score:", average_bleu_score)
print("BLEU individual_scores:", individual_scores)
BLEU Score: 1.0003688322288243e-231 BLEU individual_scores: [1.0003688322288243e-231]
/usr/local/lib/python3.10/dist-packages/nltk/translate/bleu_score.py:552: UserWarning: The hypothesis contains 0 counts of 2-gram overlaps. Therefore the BLEU score evaluates to 0, independently of how many N-gram overlaps of lower order it contains. Consider using lower n-gram order or use SmoothingFunction() warnings.warn(_msg)
#The first image
image_path = "/content/486917990_72bd4069af.jpg"
image = load_image(image_path)
run_and_show_attention(model, image)
a man is climbing a rock wall
from nltk.translate.bleu_score import sentence_bleu
# Assuming references is a list of lists where each inner list contains the reference captions
references = [
['A girl is climbing a rock wall .'],
['A person climbs a steep mountain .'],
['A person wearing a white hat climbs a rock .'],
['A rock climber ascends .'],
['Someone climbs a rocks .']
]
# Assuming hypotheses is a list of generated captions
hypotheses = ['a man is climbing a rock wall']
# Tokenize the references and hypotheses
references = [[ref[0].split()] for ref in references]
hypotheses = [h.split() for h in hypotheses]
# Calculate BLEU score for each hypothesis-reference pair
individual_scores = [sentence_bleu(references[i], hypotheses[i]) for i in range(len(hypotheses))]
# Calculate the average BLEU score
average_bleu_score = sum(individual_scores) / len(individual_scores)
print("BLEU Score:", average_bleu_score)
BLEU Score: 0.5329462628216854